Objectives:
Packages:
library(tidyverse)
library(sf)
library(leaflet)
library(tmap)
Pinot noir reviews from Wine Enthusiast (scraped, compiled, and cleaned for Kaggle by user zackthoutt: https://www.kaggle.com/zynicide/wine-reviews).
# Read the raw wine reviews from the working directory.
wine <- read_csv("wine_data.csv")

# Keep the columns of interest (province is exposed as `state`), restrict the
# rows to three states, and add points per dollar (ppd), sorted from the
# highest ppd to the lowest.
wine_new <- wine %>%
  select(country, state = province, winery, region_1, points, price) %>%
  filter(state %in% c("California", "Oregon", "Washington")) %>%
  mutate(ppd = round(points / price, 2)) %>% # points per dollar!
  arrange(desc(ppd))
# Per-state summary of points per dollar (ppd): mean, median, and minimum
# (ignoring missing values), plus the number of rows in each state.
wine_summary <- summarize(
  group_by(wine_new, state),
  mean_ppp = mean(ppd, na.rm = TRUE),
  median_ppp = median(ppd, na.rm = TRUE),
  min = min(ppd, na.rm = TRUE),
  count = n()
)

wine_summary
## # A tibble: 3 x 5
## state mean_ppp median_ppp min count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 California 2.43 2.02 0.63 6896
## 2 Oregon 2.42 2.12 0.34 2786
## 3 Washington 3.35 2.84 1.4 34
# Histograms of points per dollar, one panel per state, with each panel on
# its own axis ranges (scales = "free"). The bin count is left at the
# stat_bin() default.
wine_hists <- ggplot(data = wine_new, mapping = aes(x = ppd, fill = state)) +
  geom_histogram(alpha = 0.7) +
  facet_wrap(~state, scales = "free") +
  scale_fill_manual(values = c("royalblue", "mediumorchid1", "orange"))

wine_hists
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot of points against price, one panel per state, each with a
# linear-model trend line drawn without a confidence band.
wine_points <- ggplot(wine_new, aes(x = price, y = points)) +
  geom_point(aes(color = state), alpha = 0.1) +
  geom_smooth(color = "black", method = "lm", size = 0.4, se = FALSE) +
  scale_color_manual(values = c("royalblue", "mediumorchid1", "orange")) +
  theme_light() +
  labs(x = "Price", y = "Points") +
  # "none" is the documented value for hiding the legend; the string "NA" is
  # not a valid legend position.
  theme(legend.position = "none") +
  facet_wrap(~state)

wine_points
Data: CA.gov California Open Data Portal: https://data.ca.gov/dataset/ca-geographic-boundaries
File types (see more at https://www.census.gov/geo/maps-data/data/tiger-line.html):
Attribute variables we’ll use:
# Read the CA_Counties_TIGER2016 layer from the current directory.
ca_counties <- st_read(
  layer = "CA_Counties_TIGER2016",
  dsn = "."
)
A really cool thing about the sf package is that geometries are sticky: we can work with the spatial attributes like a normal tibble/data frame, and the geometries (the spatial information) stay attached to it.
# Keep only the NAME and ALAND attributes; because sf geometries are sticky,
# the geometry column comes along with them.
ca_land <- select(ca_counties, NAME, ALAND)

# plot(ca_land)
# Read the pop/income data and rename COUNTY to NAME so the key column
# matches the one in ca_land.
ca_pop_inc <- read_csv("ca_pop_inc.csv") %>%
  rename(NAME = COUNTY)

# Join the two on the county name. Naming the key explicitly avoids an
# implicit natural join on whatever columns the two tables happen to share.
ca_df <- full_join(ca_land, ca_pop_inc, by = "NAME") %>%
  select(NAME, MedFamilyIncome)
# Make a map: polygons filled by MedFamilyIncome on a three-colour gradient,
# with thin white outlines.
ca_income <- ggplot(data = ca_df, mapping = aes(fill = MedFamilyIncome)) +
  geom_sf(color = "white", size = 0.2) +
  theme_minimal() +
  scale_fill_gradientn(colors = c("blue", "mediumorchid1", "orange"))

ca_income
# Passing ca_df straight to leaflet doesn't work:
# leaflet(ca_df) %>%
#   addPolygons()

# The projection is wrong for leaflet, which uses WGS84, so reproject the
# data to EPSG:4326 first.
ca_df_transform <- ca_df %>%
  st_transform(crs = 4326)

# Now try that again...
leaflet(data = ca_df_transform) %>%
  addTiles() %>% # background tiles
  addPolygons(
    fillColor = ~colorQuantile("YlOrRd", MedFamilyIncome)(MedFamilyIncome),
    fillOpacity = 0.5,
    color = "white",
    opacity = 1.0,
    weight = 1.0
  ) # polygons coloured by MedFamilyIncome quantile
# Switch tmap to its interactive "view" mode for the map drawn below.
tmap_mode("view")
## tmap mode set to interactive viewing
## tmap mode set to interactive viewing
# if (Sys.getenv("USER") != "CRAN")
# Draw the reprojected data with polygons filled by MedFamilyIncome at 50%
# opacity.
tm_shape(ca_df_transform) + tm_fill("MedFamilyIncome", alpha = 0.5)